############################################################################
#####################      function definitions  ###########################
############################################################################
library(amap)
library(gplots)
# Draw vertical error bars on the current plot using arrows() with flat caps.
#
# x, y:   coordinates of the bar centres.
# upper:  distance from y up to the top cap.
# lower:  distance from y down to the bottom cap (defaults to `upper`).
# length: forwarded to arrows() as its `length` argument (cap size).
# ...:    further arguments forwarded to arrows().
# Stops with an error when x, y, lower and upper differ in length.
error.bar <- function(x, y, upper, lower=upper, length=0.1,...){
  # Each comparison is a scalar, so the short-circuiting `||` is the right
  # operator. The argument called `length` does not mask length() in calls,
  # because R skips non-function bindings when resolving a function name.
  if(length(x) != length(y) || length(y) != length(lower) || length(lower) != length(upper))
    stop("vectors must be same length")
  # angle=90 with code=3 turns both arrow heads into flat caps.
  arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...)
}

# Return the names of the `n` largest entries of a named vector, ordered from
# the largest value to the smallest.
#
# NOTE(review): despite "TopThree" in the name, this function has always
# returned five names; the default n = 5 keeps that behaviour for callers.
#
# x: named vector of scores; the names are what gets returned.
# n: maximum number of names to return (default 5).
# Returns a character vector of at most `n` names. When `x` has fewer than `n`
# entries the result is shorter rather than padded with NA.
TopThreeRelatedRepeats <- function(x, n=5){
	ranked <- names(x)[order(x,decreasing=TRUE)]
	head(ranked, n)
}

# Plot the profile of one gene from two expression tables on shared axes.
#
# x1, x2: expression tables indexed by gene on rows; row `gene` of each is drawn.
# gene:   row identifier looked up in both tables.
# main:   plot title.
# Line colours are entries 1 and 2 of the file-level palette `cccol`.
GeneExpressionPattern <- function(x1,x2,gene,main){
	profile1 <- x1[gene,]
	profile2 <- x2[gene,]
	# One y-range covering both profiles so neither curve is cut off.
	y_limits <- range(cbind(profile1,profile2))
	plot(seq(length(profile1)),profile1,main=main,col=cccol[1],type='l',ylab="log2(fpkm+1)",xlab="",lwd=2,ylim=y_limits)
	lines(seq(length(profile2)),profile2,col=cccol[2],lwd=2)
}

# Base palette (14 hex colours) shared by the plotting code in this file.
cccol <- c(
  "#CE0013", "#16557A", "#C7A609", "#87C232", "#64C0AB", "#A14C94", "#15A08C",
  "#8B7E75", "#1E7CAF", "#EA425F", "#46489A", "#E50033", "#0F231F", "#1187CD"
)
# The same colours with "50" appended, i.e. #RRGGBBAA with alpha byte 0x50.
cccol50 <- paste0(cccol, "50")

############################################################################
#####################             data           ###########################
############################################################################

############ 2nd naive RNAseq
# Expression table for the naive series; first column supplies the row names.
logfpkm2nd <- read.table("../data/2nd.reprogramming.lg2.all.fpkm.txt",header=TRUE,row.names=1)
# Replicate columns to keep, in time-course order.
n_path <- c("hiF_r1","hiF_r2","he0_r1","he0_r2","he2_r1","he2_r2","he6_r1","he6_r2","n8_r1","n8_r2","n8_r3","n12_r1","n12_r2","n14_r1","n14_r2","n14_r3","n20_r1","n20_r2","n20_r3","n24p_r1","n24p_r2","n24m_r1","n24m_r2","niPS_r1","niPS_r2")
nData_tmp <- logfpkm2nd[,n_path]
# 2^x - 1 is the inverse of log2(x + 1).
nfpkm2nd <- 2^nData_tmp - 1
n_time_point <- c("hiF","he0","he2","he6","n8","n12","n14","n20","n24pdox","n24mdox","niPS")
n_label <- c("hiF-T","0d","2d","6d","8d","12d","14d","20d","24d+dox","24d-dox","niPSC-T")
# Positions within n_path of the replicates belonging to each entry of
# n_time_point (same order); each group is averaged row by row.
n_replicate_cols <- list(1:2,3:4,5:6,7:8,9:11,12:13,14:16,17:19,20:21,22:23,24:25)
nData2ndfpkm <- do.call(cbind,lapply(n_replicate_cols,function(cols) apply(nfpkm2nd[,cols],1,mean)))
colnames(nData2ndfpkm) <- n_time_point
rownames(nData2ndfpkm) <- rownames(nfpkm2nd)

############ 2nd primed RNAseq
# Expression table for the primed series; first column supplies the row names.
pData2ndfpkm <- read.table("../data/paper.primed.fpkm.txt",header=TRUE,row.names=1)

# Put both series on the same log2(value + 1) scale.
nData <- log2(nData2ndfpkm+1)
pData <- log2(pData2ndfpkm+1)

# Display labels for the time points; the last label names both end states.
common_time_point <- c("hiF-T","2d","6d","8d","14d","20d","24d+dox","24d-dox","niPSC-T/piPSC-T")

# KRAB-ZNF gene list: keep the genes found in both expression tables, then
# append "ZNF534".
# NOTE(review): "ZNF534" is appended without checking either table; confirm it
# is a row of nData (plot_matrix is indexed by it) and not already in the list.
KRAB_ZNF <- as.vector(read.table("../data/KRAB_ZNF.genes")[,1])
KRAB_ZNF <- c(Reduce(intersect,list(KRAB_ZNF,rownames(nData),rownames(pData))),"ZNF534")

# Naive-series expression restricted to the KRAB-ZNF genes.
plot_matrix <- nData[KRAB_ZNF,]

# ZNF repeats
ZNF_repeats <- read.table("../data/ZNF_repeats.txt",row.names=1,header=TRUE)
# Column names are replaced by the header line of a separate file.
ZNF_repeats_colnames <- colnames(read.table("../data/ZNF_repeats_standardlise_colnames.txt",header=TRUE))
colnames(ZNF_repeats) <- ZNF_repeats_colnames

# Keep only the rows for KRAB-ZNF genes that appear in the table.
common_ZNF <- intersect(KRAB_ZNF,row.names(ZNF_repeats))
ZNF_repeats <- ZNF_repeats[common_ZNF,]

# repeats expression
# Row labels applied to every repeat-expression summary matrix below.
fpkm_row_names <- as.vector(read.table("../data/fpkm_expression_names.txt")[,1])
# Column positions of each sample group in naive_repeats_fpkm.
# NOTE(review): n10 (column 9) is defined but not part of the summaries below.
he0 <- 1:2; he2 <- 3:4; he6 <- 5:6; hiF <- 7:8; n10 <- 9; n12 <- 10:11; n14 <- c(12:13,25); n20 <- c(14:15,26); n24m <- 16:17; n24p <- 18:19; n8 <- c(20:21,24); niPS <- 22:23
naive_repeats_fpkm <- read.table("../data/naive_repeats_fpkm.txt",row.names=1,header=TRUE)

# Groups in output-column order; the list names become the column names.
naive_repeat_groups <- list(hiF=hiF,he0=he0,he2=he2,he6=he6,n8=n8,n12=n12,n14=n14,n20=n20,n24m=n24m,n24p=n24p,niPS=niPS)
# Per-row mean and standard deviation across the columns of each group.
naiveRepeatsAverageFpkm <- do.call(cbind,lapply(naive_repeat_groups,function(cols) apply(naive_repeats_fpkm[cols],1,mean)))
naiveRepeatsSD <- do.call(cbind,lapply(naive_repeat_groups,function(cols) apply(naive_repeats_fpkm[cols],1,sd)))

# Label the rows BEFORE deriving the scaled/log matrices so that they carry
# fpkm_row_names too, exactly as the primed summaries do.
rownames(naiveRepeatsAverageFpkm) <- fpkm_row_names
rownames(naiveRepeatsSD) <- fpkm_row_names
AmplifyNaiveRepeatsAverageFpkm <- naiveRepeatsAverageFpkm * 1e3
AmplifyNaiveRepeatsSD <- naiveRepeatsSD * 1e3

LogNaiveRepeatsAverageFpkm <- log2(AmplifyNaiveRepeatsAverageFpkm+1)

# Column positions of each sample group in primed_repeats_fpkm.
p2 <- 1:2; p5 <- 3:4; p8 <- 5:6; p10 <- 7:8; p14 <- 9:10; p20 <- 11:12; p24m <- 13:16; p24p <- 17:18; hiFT <- 19:22; piPS <- 23:26
# NOTE(review): this is the only input read from an absolute, machine-specific
# path; the other inputs live under ../data/.
primed_repeats_fpkm <- read.table("/mnt/Storage3/home/zhaocc/Work/1.naiveiPS/result/0.mapping/Repeats/primed/primed_repeats_fpkm.txt",row.names=1,header=TRUE)

# Groups in output-column order; the list names become the column names.
primed_repeat_groups <- list(hiFT=hiFT,p2=p2,p5=p5,p8=p8,p10=p10,p14=p14,p20=p20,p24m=p24m,p24p=p24p,piPS=piPS)
# Per-row mean and standard deviation across the columns of each group.
primedRepeatsAverageFpkm <- do.call(cbind,lapply(primed_repeat_groups,function(cols) apply(primed_repeats_fpkm[cols],1,mean)))
primedRepeatsSD <- do.call(cbind,lapply(primed_repeat_groups,function(cols) apply(primed_repeats_fpkm[cols],1,sd)))
rownames(primedRepeatsAverageFpkm) <- fpkm_row_names
rownames(primedRepeatsSD) <- fpkm_row_names

# Scale by 1000, then take log2(value + 1) of the scaled means.
AmplifyPrimedRepeatsAverageFpkm <- primedRepeatsAverageFpkm * 1e3
AmplifyPrimedRepeatsSD <- primedRepeatsSD * 1e3

LogPrimedRepeatsAverageFpkm <- log2(AmplifyPrimedRepeatsAverageFpkm+1)
###################################################################################################
###############################         Cluster and plot          #################################
###################################################################################################

# k-means clustering of the rows of plot_matrix into k clusters.
# The seed is fixed so the cluster assignment is reproducible.
k <- 8
set.seed(199)
km <- kmeans(plot_matrix,k)
# km <- Kmeans(plot_matrix,k,method = "correlation")

# Stack the genes cluster by cluster (naive and primed tables in the same
# order), record the cumulative row count at the end of each cluster, and write
# each cluster's gene list to "<k>_Cluster_<i>_ZNF.txt".
plot_matrix_sort <- c()
plot_matrix_primed <- c()   # must exist before the first rbind() in the loop
clusterBoundary <- c()
tmp_boundary_sum <- 0
for (each in seq(k)){
    modGenes <- names(which(km$cluster==each))
    # drop=FALSE keeps a one-gene cluster as a one-row table with its row name.
    plot_matrix_sort <- rbind(plot_matrix_sort,plot_matrix[modGenes,,drop=FALSE])
    plot_matrix_primed <- rbind(plot_matrix_primed,pData[modGenes,,drop=FALSE])
    tmp_boundary_sum <- tmp_boundary_sum+length(modGenes)
    clusterBoundary <- c(clusterBoundary,tmp_boundary_sum)
    write.table(modGenes,file=paste0(k,"_Cluster_",each,"_ZNF.txt"),quote=FALSE,col.names=FALSE,row.names=FALSE)
}

# Colour scale limits: 1st and 99th percentiles of the clustered values.
all_exp <- c(as.matrix(plot_matrix))
zmax <- quantile(all_exp,0.99)
zmin <- quantile(all_exp,0.01)
plot_matrix[plot_matrix<zmin] <- zmin
plot_matrix[plot_matrix>zmax] <- zmax
# The heatmap draws plot_matrix_sort, which was assembled before the clipping
# above, so it has to be clipped as well or the image and its legend disagree.
plot_matrix_sort[plot_matrix_sort<zmin] <- zmin
plot_matrix_sort[plot_matrix_sort>zmax] <- zmax

# Heatmap: one row per gene (grouped by cluster), one column per time point.
pdf(file = paste0(k,"_ZNFExpressionHeatmapInNaivePath.pdf"), width = 4, height = 9)
par(oma=c(0.5,0.5,0.5,0.5),mar=c(2,2,8,2))
ColorRamp <- colorRampPalette(c("white","lightblue","#0E6CBE","darkblue"), bias=1)(10000)   #color list
ColorLevels <- seq(from=zmin,to=zmax,length.out=10000)   #number sequence
# layout(matrix(c(1,1,1,1,1,1,1,2),nrow=8,ncol=1,byrow=T))
# zlim pins the colour mapping to the same range the legend displays.
image(1:ncol(plot_matrix_sort), 1:nrow(plot_matrix_sort), t(plot_matrix_sort), xaxt="n",yaxt="n", col=ColorRamp, xlab="", ylab="", zlim=c(zmin,zmax))
# Dashed lines between consecutive clusters.
abline(h=clusterBoundary+0.5,lwd=2,lty=2)
axis(side=3,1:ncol(plot_matrix_sort),labels=n_label,cex.axis=1.2,las=2);box()
dev.off()

# Colour legend for the heatmap, written to its own file.
pdf(file = paste0(k,"_ZNFExpressionHeatmapInNaivePath_legend.pdf"), width = 4, height = 2)
par(mar=c(4,4,5,4))
image(ColorLevels,1,matrix(data=ColorLevels, nrow=length(ColorLevels),ncol=1),col=ColorRamp, xlab="",ylab="",cex.axis=1,xaxt="n",yaxt="n",useRaster=TRUE)
axis(side=1,c(zmin,(zmin+zmax)/2,zmax),labels=c(round(zmin,2),round((zmin+zmax)/2,2),round(zmax,2)))
dev.off()

# pdf(file = paste(k,"_ZNFExpressionHeatmapInPrimedPath.pdf",sep=""), width = 4, height = 9);
# par(oma=c(0.5,0.5,0.5,0.5),mar=c(2,2,8,2))
# ColorRamp <- colorRampPalette(c("white","lightpink","brown1","darkred"), bias=1)(10000)   #color list
# ColorLevels <- seq(to=zmax,from=zmin, length=10000)   #number sequence
# # layout(matrix(c(1,1,1,1,1,1,1,2),nrow=8,ncol=1,byrow=T))
# image(1:ncol(plot_matrix_primed), 1:nrow(plot_matrix_primed), t(plot_matrix_primed), xaxt="n", col=ColorRamp, xlab="", ylab="")
# abline(h=clusterBoundary+0.5,lwd=2,lty=2)
# axis(side=3,1:ncol(plot_matrix_primed),labels=colnames(plot_matrix),cex.axis=1.2,las=2);box()
# dev.off()
# pdf(file = paste(k,"_ZNFExpressionHeatmapInPrimedPath_legend.pdf",sep=""), width = 4, height = 2);
# par(mar=c(4,4,5,4))
# image(ColorLevels,1,matrix(data=ColorLevels, nrow=length(ColorLevels),ncol=1),col=ColorRamp, xlab="",ylab="",cex.axis=1,xaxt="n",yaxt="n",useRaster=T)
# axis(side=1,c(zmin,(zmin+zmax)/2,zmax),labels=c(round(zmin,2),round((zmin+zmax)/2,2),round(zmax,2)))
# dev.off()

# ####### related repeats for each cluster
# top_3_repeats <- apply(ZNF_repeats,1,TopThreeRelatedRepeats)

# RepeatsExpressionPattern <- function(x,genes,main,col){
# 	ymax <- max(x[genes,])
# 	ymin <- min(x[genes,])
# 	plot(seq(ncol(x)),seq(ncol(x)),type="n",xlab="",ylab="log2(Expression*1000+1)",main=main,ylim=c(ymin,ymax))
# 	for (each in genes){
# 		points(seq(ncol(x)),x[each,],col=col,type="l",lwd=2)
# 	}
# }

# RepeatsExpressionBoxplot <- function(x,genes,main,col){
# 	boxplot(x[genes,],main=main,col=col,outline = F,las=2)
# }

# pdf("RepeatsExpressionPattern.pdf",width=9,height=5)
# par(mfrow=c(2,4))
# for (each in seq(k)){
#     ZNF <- intersect(colnames(top_3_repeats),names(which(km$cluster==each)))
#     repeats <- intersect(unique(as.vector(top_3_repeats[,ZNF])),fpkm_row_names)
# 	RepeatsExpressionPattern(LogNaiveRepeatsAverageFpkm,repeats,paste("Cluster",each,sep=""),cccol50[each])
# }
# dev.off()

# pdf("RepeatsExpressionBoxplot.pdf",width=9,height=5)
# par(mfrow=c(2,4))
# for (each in seq(k)){
#     ZNF <- intersect(colnames(top_3_repeats),names(which(km$cluster==each)))
#     repeats <- intersect(unique(as.vector(top_3_repeats[,ZNF])),fpkm_row_names)
# 	RepeatsExpressionBoxplot(LogNaiveRepeatsAverageFpkm,repeats,paste("Cluster",each,sep=""),cccol[each])
# }
# dev.off()

# for (each in seq(k)){
#     pdf(paste(each,"_ExpressionChangePattern.pdf"),width=4,height=3)
#     ZNF <- names(which(km$cluster==each))
#     for (each_gene in ZNF){
# 	    GeneExpressionPattern(nData,pData,each_gene,each_gene)
#     }
#     dev.off()
# }